Filter criteria:
Filter differentially expressed genes between autism and control (p-value < 0.05)
No samples are removed based on network connectivity z-scores
# Print dataset dimensions and the ASD/control breakdown after filtering
# (genes are rows, samples are columns in datExpr)
glue('Number of genes: ', nrow(datExpr), '\n',
'Number of samples: ', ncol(datExpr), ' (', sum(datMeta$Diagnosis_=='ASD'), ' ASD, ',
sum(datMeta$Diagnosis_!='ASD'), ' controls)')
## Number of genes: 3385
## Number of samples: 86 (51 ASD, 35 controls)
First principal component still explains over 90% of the total variance
reduce_dim_datExpr = function(datExpr, datMeta, var_explained=0.98){
  # Reduce the dimensionality of the expression data with PCA, keeping the
  # smallest set of leading components whose cumulative proportion of variance
  # explained reaches `var_explained`.
  #
  # Args:
  #   datExpr:       expression matrix (genes as rows, samples as columns)
  #   datMeta:       sample metadata (currently unused; kept for interface stability)
  #   var_explained: cumulative variance threshold to retain (default 0.98)
  #
  # Returns a list with:
  #   datExpr:    data.frame of the retained principal component scores
  #   pca_output: the full prcomp object (needed later to project new data)
  datExpr = data.frame(datExpr)
  datExpr_pca = prcomp(datExpr, scale.=TRUE)

  # Scree plots: per-component variance (importance row 2) and cumulative
  # variance (importance row 3) with the threshold marked on the latter.
  # NOTE(review): par() is modified without being restored; knitr resets it per
  # chunk, but this leaks in an interactive session.
  par(mfrow=c(1,2))
  plot(summary(datExpr_pca)$importance[2,], type='b')
  plot(summary(datExpr_pca)$importance[3,], type='b')
  abline(h=var_explained, col='blue')

  # First component whose cumulative variance reaches the threshold.
  # BUG FIX: the previous top_n(-1, ID) ranked the PC names lexicographically
  # ('PC10' sorts before 'PC9'), which can select the wrong component. The
  # importance rows are already in PC order, so the first row passing the
  # filter is the component we want.
  last_pc = data.frame(summary(datExpr_pca)$importance[3,]) %>% rownames_to_column(var='ID') %>%
            filter(.[[2]] >= var_explained) %>% slice(1)
  print(glue('Keeping top ', substr(last_pc$ID, 3, nchar(last_pc$ID)), ' components that explain ',
             var_explained*100, '% of the variance'))

  # Keep the scores of PC1 through the selected component
  datExpr_top_pc = datExpr_pca$x %>% data.frame %>% dplyr::select(PC1:last_pc$ID)
  return(list('datExpr'=datExpr_top_pc, 'pca_output'=datExpr_pca))
}
# Run the dimensionality reduction and unpack its two results
dim_reduction = reduce_dim_datExpr(datExpr, datMeta)
## Keeping top 11 components that explain 98% of the variance
datExpr_redDim = dim_reduction$datExpr
pca_output = dim_reduction$pca_output

# Drop objects that are no longer needed downstream
rm(datSeq, datProbes, reduce_dim_datExpr, dim_reduction, datExpr)
Genes seem to have separated into two clouds of points:
# Scatter of the genes on the first two principal components
ggplot(datExpr_redDim, aes(x=PC1, y=PC2)) + geom_point() + theme_minimal()
Projecting all the original points into the space created by the two principal components and colouring by the differential expression p-value we can see that the points in the middle of the two clouds were filtered out because their DE wasn’t statistically significant. Colouring by their log2 fold change we can see that the genes from the cloud on the top are overexpressed and the genes in the bottom one underexpressed.
# Reload the full (unfiltered) normalized expression data so every gene can be
# projected into the PCA space that was fitted on the filtered set
load('./../working_data/RNAseq_ASD_4region_normalized.Rdata')
# Remove Subject with ID = 'AN03345'
keep = datMeta$Subject_ID!='AN03345'
datMeta = datMeta[keep,]
datExpr = datExpr[,keep]
# Calculate differential expression p-value of each gene
# limma pipeline: model Diagnosis (Dx) and account for repeated measures per
# subject (multiple brain regions) via duplicateCorrelation
mod = model.matrix(~ Dx, data=datMeta)
corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
lmfit = lmFit(datExpr, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)
fit = eBayes(lmfit, trend=T, robust=T)
# Keep all genes (number = nrow), then reorder to match the rows of datExpr
top_genes = topTable(fit, coef=2, number=nrow(datExpr))
ordered_top_genes = top_genes[match(rownames(datExpr), rownames(top_genes)),]
ASD_pvals = ordered_top_genes$adj.P.Val
log_fold_change = ordered_top_genes$logFC
# Project every gene onto the principal components of the filtered data.
# NOTE(review): scale() re-centers/re-scales with this matrix's own statistics,
# not the centers/scales used when pca_output was fitted — confirm intended.
pca_data_projection = scale(datExpr) %*% pca_output$rotation %>% data.frame
# Same projection coloured by adjusted p-value and by log fold change
p1 = pca_data_projection %>% ggplot(aes(x=PC1, y=PC2, color=ASD_pvals)) + geom_point(alpha=0.5) + theme_minimal()
p2 = pca_data_projection %>% ggplot(aes(x=PC1, y=PC2, color=log_fold_change)) + geom_point(alpha=0.5) + theme_minimal() +
scale_colour_gradient2()
grid.arrange(p1, p2, ncol=2)
# Clean up everything except the projection itself
rm(mod, corfit, lmfit, fit, ASD_pvals, top_genes, ordered_top_genes, datExpr, datProbes, datSeq)
# Container for the labels produced by each clustering method
clusterings = list()

# Elbow curve: total within-cluster sum of squares for k = 1..10
set.seed(123)
wss = sapply(1:10, function(n_centers)
  kmeans(datExpr_redDim, n_centers, iter.max=100, nstart=25,
         algorithm='MacQueen')$tot.withinss)
plot(wss, type='b', main='K-Means Clustering')

# Elbow chosen at k = 4
best_k = 4
abline(v = best_k, col='blue')

# Final fit with the chosen k
# NOTE(review): this call omits algorithm='MacQueen' and therefore uses the
# default Hartigan-Wong algorithm, unlike the elbow scan — confirm intended.
datExpr_k_means = kmeans(datExpr_redDim, best_k, iter.max=100, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster
Chose k=6 as best number of clusters.
Clusters seem to be able to separate ASD and control samples pretty well and there are no noticeable patterns regarding sex, age or brain region in any cluster.
Younger ASD samples seem to be more similar to control samples than older ASD samples (pink cluster has most of the youngest samples). The yellow cluster is made of young ASD samples.
# Hierarchical clustering on the euclidean distances between genes
h_clusts = hclust(dist(datExpr_redDim))
plot(h_clusts, hang = -1, cex = 0.6, labels=FALSE)

# Cut the dendrogram into 6 clusters
best_k = 6
clusterings[['hc']] = cutree(h_clusts, best_k)
Samples are grouped into two big clusters, two small clusters and two outliers; the first big cluster has no subclusters and the second one has three subclusters and two outliers.
*Output plots in clustering_genes_03_19 folder
Following this paper’s guidelines:
Run PCA and keep enough components to explain 60% of the variance
Run ICA with that same number of nbComp as principal components kept to then filter them
Select components with kurtosis > 3
Assign genes to clusters with FDR<0.001 using the fdrtool package
# ICA-based clustering following the guidelines listed above
ICA_output = datExpr_redDim %>% runICA(nbComp=ncol(datExpr_redDim), method='JADE')

# Keep only the independent components with kurtosis > 3.
# BUG FIX: the original passed names(apply(...) > 3) to select(); names() of a
# logical vector returns ALL the names (the comparison preserves them), so no
# component was actually filtered out. which() restricts to the components that
# pass the threshold.
signals_w_kurtosis = ICA_output$S %>% data.frame %>%
  dplyr::select(names(which(apply(ICA_output$S, 2, kurtosis) > 3)))

# Assign genes to components via fdrtool q-values.
# NOTE(review): the guidelines above say FDR < 0.001 but this uses 0.01 —
# confirm which threshold was intended.
ICA_clusters = apply(signals_w_kurtosis, 2, function(x) fdrtool(x, plot=F)$qval<0.01) %>% data.frame

# Drop components capturing at most one gene
ICA_clusters = ICA_clusters[colSums(ICA_clusters)>1]

# Flatten to a single label per gene; a gene in several components keeps the
# last one iterated (later assignments overwrite earlier ones)
clusterings[['ICA_min']] = rep(NA, nrow(ICA_clusters))
ICA_clusters_names = colnames(ICA_clusters)
for(c in ICA_clusters_names) clusterings[['ICA_min']][ICA_clusters[,c]] = c
Leaves most of the observations (~78%) without a cluster:
# Distribution of how many ICA components each gene belongs to
ICA_clusters %>% rowSums %>% table
## .
## 0 1 2 3 4 5
## 2658 482 180 46 16 3
# Membership heatmap (genes x components).
# NOTE(review): cl_sum is added as an extra column and therefore appears as an
# extra tile row in the plot — confirm that is intentional.
ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2,Var1)) +
geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') +
theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()
best_power=51 but blockwiseModules only accepts powers up to 30, so 30 was used instead
# Scan candidate soft-thresholding powers for WGCNA's scale-free topology fit
# (genes must be columns, hence the transpose)
best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = seq(1, 60, by=3))
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 7.38e-01 2.2000 0.739 2470 2730 2860
## 2 4 2.19e-01 0.3960 0.916 1590 1890 2250
## 3 7 4.11e-02 0.1290 0.920 1220 1450 1950
## 4 10 3.20e-07 0.0003 0.929 994 1160 1740
## 5 13 2.64e-02 -0.0792 0.953 840 950 1590
## 6 16 9.12e-02 -0.1400 0.971 727 790 1460
## 7 19 1.83e-01 -0.1890 0.962 640 664 1360
## 8 22 2.75e-01 -0.2310 0.970 570 563 1270
## 9 25 3.68e-01 -0.2640 0.971 513 486 1200
## 10 28 4.56e-01 -0.2950 0.972 465 421 1140
## 11 31 5.41e-01 -0.3280 0.971 425 366 1080
## 12 34 6.19e-01 -0.3540 0.958 390 318 1030
## 13 37 6.85e-01 -0.3790 0.969 360 280 981
## 14 40 7.44e-01 -0.4000 0.953 333 250 939
## 15 43 7.86e-01 -0.4210 0.956 310 221 901
## 16 46 8.15e-01 -0.4420 0.958 289 198 866
## 17 49 8.37e-01 -0.4640 0.955 270 177 834
## 18 52 8.58e-01 -0.4820 0.953 253 159 804
## 19 55 8.63e-01 -0.4990 0.933 238 145 777
## 20 58 8.81e-01 -0.5160 0.931 225 132 751
# Build the WGCNA co-expression modules. The preferred power (51) exceeds the
# function's cap, so 30 is used instead (see note above).
network = blockwiseModules(t(datExpr_redDim), power=30, numericLabels=TRUE)
clusterings[['WGCNA']] = network$colors
names(clusterings[['WGCNA']]) = rownames(datExpr_redDim)
It leaves 348 genes without a cluster:
# Module sizes; label 0 = genes left unassigned by WGCNA
clusterings[['WGCNA']] %>% table
## .
## 0 1 2 3 4 5
## 348 1977 673 140 124 123
The number of clusters that best fits a Gaussian mixture (by BIC) is 34, but 23 gives the second-best value, so 23 was chosen because it is smaller.
# Choose the number of Gaussian mixture components via BIC
n_clust = Optimal_Clusters_GMM(datExpr_redDim, max_clusters=50, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')

# Fit the mixture with the chosen k and hard-assign each gene to the component
# with the highest per-gene log-likelihood
best_k = 23
gmm = GMM(datExpr_redDim, best_k)
clusterings[['GMM']] = apply(gmm$Log_likelihood, 1, which.max)
Plot of clusters with their centroids in gray
# Append the centroids (unlabelled, so they render in gray) to the points
gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = as.factor(c(clusterings[['GMM']], rep(NA, best_k)))
ggplotly(ggplot(gmm_points, aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())
Trying with 16 clusters (16 has the 12th lowest value)
# Refit the mixture with 16 components
best_k = 16
gmm = GMM(datExpr_redDim, best_k)
# Hard assignment: component with the highest per-gene log-likelihood
clusterings[['GMM_2']] = apply(gmm$Log_likelihood, 1, which.max)
clusterings[['GMM_2']] %>% table
## .
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 69 184 269 303 203 269 253 177 171 149 100 195 301 418 161 163
Trying with 8 clusters
# Refit the mixture with 8 components
best_k = 8
gmm = GMM(datExpr_redDim, best_k)
# Hard assignment: component with the highest per-gene log-likelihood
clusterings[['GMM_3']] = apply(gmm$Log_likelihood, 1, which.max)
clusterings[['GMM_3']] %>% table
## .
## 1 2 3 4 5 6 7 8
## 290 410 348 655 313 413 524 432
Separate the two clouds of points by a straight line. There seems to be a difference in the mean expression of the genes between clusters but not in their standard deviation.
# Split the two clouds with the straight line PC2 = -0.1*PC1 + 0.05:
# genes strictly below the line get label 1, the rest label 0
manual_clusters = as.factor(as.numeric(datExpr_redDim$PC2 < -0.1*datExpr_redDim$PC1 + 0.05))
ggplot(datExpr_redDim, aes(x=PC1, y=PC2, color=manual_clusters)) + geom_point() +
  geom_abline(slope=-0.1, intercept=0.05, color='gray') + theme_minimal()

# Label by gene and store alongside the other clusterings
names(manual_clusters) = rownames(datExpr_redDim)
clusterings[['Manual']] = manual_clusters
Cluster 1 has a bigger mean expression than cluster 2. There also seem to be more than one distributions in cluster 1, with two different means and three different standard deviations?
# Per-gene mean and sd of the retained PCs, split by manual cluster.
# NOTE(review): cbind() with a factor coerces manual_clusters to its integer
# codes (1/2), so the final factor levels are '1'/'2' rather than '0'/'1'.
manual_clusters_data = cbind(apply(datExpr_redDim, 1, mean), apply(datExpr_redDim, 1, sd),
manual_clusters) %>% data.frame
colnames(manual_clusters_data) = c('mean','sd','cluster')
manual_clusters_data = manual_clusters_data %>% mutate('cluster'=as.factor(cluster))
# Density of the means and of the sds per cluster, side by side
p1 = manual_clusters_data %>% ggplot(aes(x=mean, color=cluster, fill=cluster)) +
geom_density(alpha=0.4) + theme_minimal()
p2 = manual_clusters_data %>% ggplot(aes(x=sd, color=cluster, fill=cluster)) +
geom_density(alpha=0.4) + theme_minimal()
grid.arrange(p1, p2, ncol=2)
# Clean up; cc_output* presumably come from the consensus clustering chunk
# (not shown here) — verify they exist when this runs standalone
rm(wss, datExpr_k_means, h_clusts, cc_output, cc_output_c1, cc_output_c2, best_k, ICA_output,
ICA_clusters_names, signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network,
best_power, c, manual_clusters, manual_clusters_data)
Using Adjusted Rand Index:
Clusterings seem to give very different results and none resembles the manual separation
Simple methods give similar results (K-means, hierarchical clustering, consensus clustering)
# Pairwise Adjusted Rand Index between every pair of clusterings.
# Only the upper triangle (including the diagonal) is filled; the lower
# triangle stays NA.
n_methods = length(clusterings)
cluster_sim = data.frame(matrix(nrow = n_methods, ncol = n_methods))
for(a in seq_len(n_methods)){
  labels_a = as.factor(clusterings[[a]])
  for(b in a:n_methods){
    labels_b = as.factor(clusterings[[b]])
    cluster_sim[a,b] = adj.rand.index(labels_a, labels_b)
  }
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)
cluster_sim = round(as.matrix(cluster_sim), 2)

# Annotated heatmap of the similarity matrix (no dendrograms, values printed
# in each cell)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none',
          cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE,
          cexRow = 1, cexCol = 1, margins = c(7,7))
rm(a, b, n_methods, labels_a, labels_b, cluster_sim)
create_2D_plot = function(cat_var){
  # Interactive 2D scatter of the first two PCs, coloured by `cat_var`
  # (a string naming a column of the global `plot_points`). Axis labels show
  # the % of variance explained by each component.
  # MODERNIZED: uses the .data pronoun instead of the deprecated aes_string().
  ggplotly(plot_points %>% ggplot(aes(x=PC1, y=PC2, color=.data[[cat_var]])) +
           geom_point() + theme_minimal() +
           xlab(paste0('PC1 (', round(summary(pca_output)$importance[2,1]*100,2),'%)')) +
           ylab(paste0('PC2 (', round(summary(pca_output)$importance[2,2]*100,2),'%)')))
}
create_3D_plot = function(cat_var){
  # Interactive 3D scatter of the first three PCs, coloured by `cat_var`
  # (a string naming a column of the global `plot_points`). Axis titles show
  # the % of variance explained by each component.
  pct_var = summary(pca_output)$importance[2,]
  axis_title = function(i) paste0('PC', i, ' (', round(pct_var[i]*100, 2), '%)')
  plot_points %>% plot_ly(x=~PC1, y=~PC2, z=~PC3) %>%
    add_markers(color=plot_points[,cat_var], size=1) %>%
    layout(title = glue('Samples coloured by ', cat_var),
           scene = list(xaxis=list(title=axis_title(1)),
                        yaxis=list(title=axis_title(2)),
                        zaxis=list(title=axis_title(3))))
}
# Assemble the plotting data: first three PC scores plus one factor column per
# clustering method.
# NOTE(review): cc_l1/cc_l2 (consensus clustering) are produced by a chunk not
# shown here — verify they are in `clusterings` before this runs.
plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
mutate(ID = rownames(.), km_clust = as.factor(clusterings[['km']]),
hc_clust = as.factor(clusterings[['hc']]), cc_l1_clust = as.factor(clusterings[['cc_l1']]),
cc_clust = as.factor(clusterings[['cc_l2']]), ica_clust = as.factor(clusterings[['ICA_min']]),
n_ica_clust = as.factor(rowSums(ICA_clusters)), gmm_clust = as.factor(clusterings[['GMM']]),
gmm_clust2 = as.factor(clusterings[['GMM_2']]), gmm_clust3 = as.factor(clusterings[['GMM_3']]),
wgcna_clust = as.factor(clusterings[['WGCNA']]), manual_clust=as.factor(clusterings[['Manual']]))
Simple methods seem to only partition the space in buckets using information from the first component
All clusterings except K-means manage to separate the two clouds at least partially, but none does a particularly good job
WGCNA creates clusters inverted between clouds
# 2D PCA plots, one per clustering method
create_2D_plot('km_clust')
create_2D_plot('hc_clust')
create_2D_plot('cc_l1_clust')
create_2D_plot('cc_clust')
create_2D_plot('ica_clust')
create_2D_plot('gmm_clust')
create_2D_plot('gmm_clust2')
create_2D_plot('gmm_clust3')
create_2D_plot('wgcna_clust')
# 3D versions (first three PCs) for the methods that use more components
create_3D_plot('ica_clust')
create_3D_plot('gmm_clust')
create_3D_plot('gmm_clust3')
create_3D_plot('wgcna_clust')